library(ape)
#install.packages("phangorn")
library(phangorn)Dicaeum gene tree coalescent heights investigation
0.1 load libraries
0.2 Info on function inputs
Here we will be using a single function to perform the gene tree topology sorting and coalescent height retrieval. The exact details of the required inputs for this function are explained below:
#################
#required inputs#
#################
#use 'sl_aln' to specify a set of single locus fasta alignments read into the working directory as a list of objects each of class 'DNAbin'. Example code for how to create an appropriate input for this variable using a for loop around the ape function 'read.dna()' is shown below in the section called "Bring in alignments and create the 'sl_aln' variable".
#use 'popmap' to specify a dataframe that gives each sample ID and its corresponding population for the alignments specified as 'sl_aln'.
#Must be a dataframe with two columns, first a character vector named 'id', and second a character vector named 'pops' with levels = c('P1','P2','P3','OG')
#e.g.,
#id | pop
#---------------------
#D_hypo_123 | P1
#D_hypo_158 | P2
#D_hypo_199 | P3
#D_ni_555 | OG
#use 'quartet.samps' to designate the number of random quartets you want sampled (must be an integer)
#e.g., quartet.samps=1
#use 'model' to designate the sequence evolution model you want to use (options are 'raw', 'F84', or 'JC69'):
#e.g., model="raw"
#use 'treebuilder' to designate the approach you want to use for building gene trees (options are 'nj' or 'upgma')
#e.g., tree.builder="nj"
#################
#optional inputs#
#################
#use 'out.table' to specify the full output path and filename where you would like the results table saved (must be .txt extension)
#e.g., out.table="~/Desktop/coal.heights.txt"0.3 Bring in alignments and create the ‘sl_aln’ variable
‘sl_aln’ is a list of single-locus fasta alignments. I use the code below to read them into R working memory and store them in a single sequential list object.
#Designate the directory where each of your single-locus fasta alignments are stored (must end with /)
sl_aln_dir<-"~/Desktop/phil.dicaeum/astral/fastas/"
#create list of single-locus alignment file names
sl_listfile<-list.files(path = sl_aln_dir)
#open an empty list to store each alignment
sl_aln<-list()
#use a loop to read in each fasta alignment and store them all in the list named 'sl_aln'
for (i in 1:length(sl_listfile)){
#Read the alignments into local memory
sl_aln[[i]]<-read.dna(file = paste(sl_aln_dir, sl_listfile[i], sep = ""), format = "fasta")
}0.4 Bring in sample info and define ‘popmap’ variable
‘popmap’ is a dataframe mapping each sample in the alignment to one of 4 pre-designated species tips (details below). I read in and clean that dataframe using the following code:
#read in sampling sheet
popmap<-read.csv("~/Desktop/phil.dicaeum/dicaeum.retained.sampling.csv")[,c(1,5)]
#get exact names in the fasta alignment
fas<-names(read.FASTA("~/Desktop/phil.dicaeum/astral/fastas/locus.1.fasta"))
#add those to the sampling sheet and check that the order matches
popmap$id<-fas
#rename taxa to match the needed levels ('P1','P2','P3','OG')
popmap$pop<-gsub("Zamboanga","P1",gsub("Mindanao","P2",gsub("Luzon","P3",gsub("nigrilore","OG",popmap$Taxa))))
#subset out the relevant columns
popmap<-popmap[,c(3,4)]
#check that this looks as expected
head(popmap) id pop
1 hyp18070 P3
2 hyp18191 P1
3 hyp14037 P2
4 hyp14079 P2
5 hyp14065 P2
6 hyp14120 P2
0.5 Define a function to calculate topology frequencies and associated coalescent heights
The exact details of the required inputs for this function are explained above
#################
#define function#
#################
#define a function that will calculate your gene tree topology frequencies (excluding unresolved rooted 3 tip trees)
calc.heights.dummy<-function(sl_aln, popmap, quartet.samps, model, tree.builder, out.table=NULL){
#open dataframe to hold all results
top.dist<-data.frame()
#begin the random sampling loop
for (k in 1:quartet.samps){
#randomly downsample all individuals present in the alignment to a single tip from each of the four taxa
#P1<-sample(popmap$id[popmap$pop == "P1"], size = 1)
#P2<-sample(popmap$id[popmap$pop == "P2"], size = 1)
#P3<-sample(popmap$id[popmap$pop == "P3"], size = 1)
#P_out<-sample(popmap$id[popmap$pop == "OG"], size = 1)
P1<-popmap$id[2]
P2<-popmap$id[3]
P3<-popmap$id[1]
P_out<-popmap$id[55]
#open empty vectors to hold key variables for the given sampling scheme
trees<-c()
topology<-c()
sister.dist<-c()
aln.num<-c()
#begin the fasta alignment loop
for (i in 1:length(sl_aln)){
#Prune the given alignment to contain only the sequence of interest (i) and the four randomly selected tips
sl_aln_pruned<-sl_aln[[i]][c(P1, P2, P3, P_out),]
#convert this alignment to the proper class
dist_mat<-dist.dna(sl_aln_pruned, as.matrix = TRUE, model = model)
#Get the neighbor joining topology of tree
if(tree.builder == "nj"){tree<-nj(dist_mat)}
else if(tree.builder == "upgma"){tree<-upgma(dist_mat)}
#Root the tree
root_tree<-root.phylo(tree, outgroup = grep(P_out,tree$tip.label), resolve.root = TRUE)
#record the tree topology
trees[i]<-write.tree(root_tree)
#Get the topology of the full nj tree and record the pairwise distance between the two taxa recovered as sister (a proxy for coalescent height)
if(root_tree$edge.length[root_tree$edge[,2] > Ntip(root_tree)][2] == 0){
topology[i]<-"unresolved"
sister.dist[i]<-NA
}
else if(is.monophyletic(tree, tips = c(1, 2))){
topology[i]<-"12top"
sister.dist[i]<-dist_mat[1,2]
}
else if(is.monophyletic(tree, tips = c(2, 3))){
topology[i]<-"23top"
sister.dist[i]<-dist_mat[2,3]
}
else if(is.monophyletic(tree, tips = c(1, 3))){
topology[i]<-"13top"
sister.dist[i]<-dist_mat[1,3]
}
#record the alignment number
aln.num[i]<-i
} #end the loop that iterates over each input fasta alignment
#combine the results of the gene tree loop into a dataframe that stores all the information plus records the random sampling iteration
top.dist<-rbind(top.dist,
data.frame(rand.samp=rep(k, times=length(aln.num)),
alignment=aln.num,
topology=topology,
tree=trees,
sister.dist=sister.dist))
} #end random sampling loop
#
print((table(top.dist$topology)))
#plot
#coal height of sister relationship in gene trees matching the 35% topology
hist(top.dist$sister.dist[top.dist$topology == "12top"], breaks=20, main="pairwise dist P1-P2 in (P1,P2) gene trees",
xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "12top"], na.rm = T), 4)))
#coal height of sister relationship in gene trees matching the 34% topology
hist(top.dist$sister.dist[top.dist$topology == "23top"], breaks=20, main="pairwise dist P2-P3 in (P2,P3) gene trees",
xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "23top"], na.rm = T), 4)))
#coal height of sister relationship in gene trees matching the 31% topology
hist(top.dist$sister.dist[top.dist$topology == "13top"], breaks=20, main="pairwise dist P1-P3 in (P1,P3) gene trees",
xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "13top"], na.rm = T), 4)))
return(top.dist)
#write this info to disk if requested by user
if(!is.null(out.table)){write.table(top.dist, file = out.table, sep = "\t", quote = F, row.names = F)}
} #close function0.6 run the function specifying “raw” divergence and “neighbor-joining” tree building
This function is specified as ‘dummy’ because it is not actually doing the random sampling, it is just picking the same individuals each time, so that we can determine whether the models chosen are affecting our inferences on the output
raw.nj<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="raw", tree.builder="nj")
12top 13top 23top unresolved
173 46 70 2301
0.7 “F84” divergence and “neighbor-joining” tree building
F84.nj<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="F84", tree.builder="nj")
12top 13top 23top unresolved
235 132 87 2136
0.8 “raw” divergence and “UPGMA” tree building
raw.upgma<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="raw", tree.builder="upgma", out.table)
12top 13top 23top unresolved
141 99 185 2135
0.9 “F84” divergence and “UPGMA” tree building
F84.upgma<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="F84", tree.builder="upgma", out.table)
12top 13top 23top unresolved
128 98 194 2134
redefine the dummy function using a sample from mt busa and see if that changes things
#################
#define function#
#################
#define a function that will calculate your gene tree topology frequencies (excluding unresolved rooted 3 tip trees)
calc.heights.dummy<-function(sl_aln, popmap, quartet.samps, model, tree.builder, out.table=NULL){
#open dataframe to hold all results
top.dist<-data.frame()
#begin the random sampling loop
for (k in 1:quartet.samps){
#randomly downsample all individuals present in the alignment to a single tip from each of the four taxa
#P1<-sample(popmap$id[popmap$pop == "P1"], size = 1)
#P2<-sample(popmap$id[popmap$pop == "P2"], size = 1)
#P3<-sample(popmap$id[popmap$pop == "P3"], size = 1)
#P_out<-sample(popmap$id[popmap$pop == "OG"], size = 1)
P1<-popmap$id[2]
P2<-popmap$id[23]
P3<-popmap$id[1]
P_out<-popmap$id[55]
#open empty vectors to hold key variables for the given sampling scheme
trees<-c()
topology<-c()
sister.dist<-c()
aln.num<-c()
#begin the fasta alignment loop
for (i in 1:length(sl_aln)){
#Prune the given alignment to contain only the sequence of interest (i) and the four randomly selected tips
sl_aln_pruned<-sl_aln[[i]][c(P1, P2, P3, P_out),]
#convert this alignment to the proper class
dist_mat<-dist.dna(sl_aln_pruned, as.matrix = TRUE, model = model)
#Get the neighbor joining topology of tree
if(tree.builder == "nj"){tree<-nj(dist_mat)}
else if(tree.builder == "upgma"){tree<-upgma(dist_mat)}
#Root the tree
root_tree<-root.phylo(tree, outgroup = grep(P_out,tree$tip.label), resolve.root = TRUE)
#record the tree topology
trees[i]<-write.tree(root_tree)
#Get the topology of the full nj tree and record the pairwise distance between the two taxa recovered as sister (a proxy for coalescent height)
if(root_tree$edge.length[root_tree$edge[,2] > Ntip(root_tree)][2] == 0){
topology[i]<-"unresolved"
sister.dist[i]<-NA
}
else if(is.monophyletic(tree, tips = c(1, 2))){
topology[i]<-"12top"
sister.dist[i]<-dist_mat[1,2]
}
else if(is.monophyletic(tree, tips = c(2, 3))){
topology[i]<-"23top"
sister.dist[i]<-dist_mat[2,3]
}
else if(is.monophyletic(tree, tips = c(1, 3))){
topology[i]<-"13top"
sister.dist[i]<-dist_mat[1,3]
}
#record the alignment number
aln.num[i]<-i
} #end the loop that iterates over each input fasta alignment
#combine the results of the gene tree loop into a dataframe that stores all the information plus records the random sampling iteration
top.dist<-rbind(top.dist,
data.frame(rand.samp=rep(k, times=length(aln.num)),
alignment=aln.num,
topology=topology,
tree=trees,
sister.dist=sister.dist))
} #end random sampling loop
#
print((table(top.dist$topology)))
#plot
#coal height of sister relationship in gene trees matching the 35% topology
hist(top.dist$sister.dist[top.dist$topology == "12top"], breaks=20, main="pairwise dist P1-P2 in (P1,P2) gene trees",
xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "12top"], na.rm = T), 4)))
#coal height of sister relationship in gene trees matching the 34% topology
hist(top.dist$sister.dist[top.dist$topology == "23top"], breaks=20, main="pairwise dist P2-P3 in (P2,P3) gene trees",
xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "23top"], na.rm = T), 4)))
#coal height of sister relationship in gene trees matching the 31% topology
hist(top.dist$sister.dist[top.dist$topology == "13top"], breaks=20, main="pairwise dist P1-P3 in (P1,P3) gene trees",
xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "13top"], na.rm = T), 4)))
return(top.dist)
#write this info to disk if requested by user
if(!is.null(out.table)){write.table(top.dist, file = out.table, sep = "\t", quote = F, row.names = F)}
} #close function0.10 run the function specifying “raw” divergence and “neighbor-joining” tree building
This function is specified as ‘dummy’ because it is not actually doing the random sampling, it is just picking the same individuals each time, so that we can determine whether the models chosen are affecting our inferences on the output
raw.nj<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="raw", tree.builder="nj")
12top 13top 23top unresolved
169 31 64 2326
0.11 “F84” divergence and “neighbor-joining” tree building
F84.nj<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="F84", tree.builder="nj")
12top 13top 23top unresolved
220 126 73 2171
0.12 “raw” divergence and “UPGMA” tree building
raw.upgma<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="raw", tree.builder="upgma", out.table)
12top 13top 23top unresolved
160 74 151 2173
0.13 “F84” divergence and “UPGMA” tree building
F84.upgma<-calc.heights.dummy(sl_aln, popmap, quartet.samps=1, model="F84", tree.builder="upgma", out.table)
12top 13top 23top unresolved
150 79 155 2170
define the real function allowing for random quartet sampling
#################
#define function#
#################
#define a function that will calculate your gene tree topology frequencies (excluding unresolved rooted 3 tip trees)
calc.heights<-function(sl_aln, popmap, quartet.samps, model, tree.builder, out.table=NULL){
#open dataframe to hold all results
top.dist<-data.frame()
#begin the random sampling loop
for (k in 1:quartet.samps){
#randomly downsample all individuals present in the alignment to a single tip from each of the four taxa
P1<-sample(popmap$id[popmap$pop == "P1"], size = 1)
P2<-sample(popmap$id[popmap$pop == "P2"], size = 1)
P3<-sample(popmap$id[popmap$pop == "P3"], size = 1)
P_out<-sample(popmap$id[popmap$pop == "OG"], size = 1)
#open empty vectors to hold key variables for the given sampling scheme
trees<-c()
topology<-c()
sister.dist<-c()
aln.num<-c()
#begin the fasta alignment loop
for (i in 1:length(sl_aln)){
#Prune the given alignment to contain only the sequence of interest (i) and the four randomly selected tips
sl_aln_pruned<-sl_aln[[i]][c(P1, P2, P3, P_out),]
#convert this alignment to the proper class
dist_mat<-dist.dna(sl_aln_pruned, as.matrix = TRUE, model = model)
#Get the neighbor joining topology of tree
if(tree.builder == "nj"){tree<-nj(dist_mat)}
else if(tree.builder == "upgma"){tree<-upgma(dist_mat)}
#Root the tree
root_tree<-root.phylo(tree, outgroup = grep(P_out,tree$tip.label), resolve.root = TRUE)
#record the tree topology
trees[i]<-write.tree(root_tree)
#Get the topology of the full nj tree and record the pairwise distance between the two taxa recovered as sister (a proxy for coalescent height)
if(root_tree$edge.length[root_tree$edge[,2] > Ntip(root_tree)][2] == 0){
topology[i]<-"unresolved"
sister.dist[i]<-NA
}
else if(is.monophyletic(tree, tips = c(1, 2))){
topology[i]<-"12top"
sister.dist[i]<-dist_mat[1,2]
}
else if(is.monophyletic(tree, tips = c(2, 3))){
topology[i]<-"23top"
sister.dist[i]<-dist_mat[2,3]
}
else if(is.monophyletic(tree, tips = c(1, 3))){
topology[i]<-"13top"
sister.dist[i]<-dist_mat[1,3]
}
#record the alignment number
aln.num[i]<-i
} #end the loop that iterates over each input fasta alignment
#combine the results of the gene tree loop into a dataframe that stores all the information plus records the random sampling iteration
top.dist<-rbind(top.dist,
data.frame(rand.samp=rep(k, times=length(aln.num)),
alignment=aln.num,
topology=topology,
tree=trees,
sister.dist=sister.dist))
} #end random sampling loop
#
print((table(top.dist$topology)))
#plot
#coal height of sister relationship in gene trees matching the 35% topology
hist(top.dist$sister.dist[top.dist$topology == "12top"], breaks=20, main="pairwise dist P1-P2 in (P1,P2) gene trees",
xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "12top"], na.rm = T), 4)))
#coal height of sister relationship in gene trees matching the 34% topology
hist(top.dist$sister.dist[top.dist$topology == "23top"], breaks=20, main="pairwise dist P2-P3 in (P2,P3) gene trees",
xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "23top"], na.rm = T), 4)))
#coal height of sister relationship in gene trees matching the 31% topology
hist(top.dist$sister.dist[top.dist$topology == "13top"], breaks=20, main="pairwise dist P1-P3 in (P1,P3) gene trees",
xlab = paste(model, tree.builder, "pairwise dist. mean =",round(mean(top.dist$sister.dist[top.dist$topology == "13top"], na.rm = T), 4)))
return(top.dist)
#write this info to disk if requested by user
if(!is.null(out.table)){write.table(top.dist, file = out.table, sep = "\t", quote = F, row.names = F)}
} #close function0.14 run the function specifying “raw” divergence and “neighbor-joining” tree building
each time sample 10 random quartets and give the combined output across all iterations
raw.nj<-calc.heights(sl_aln, popmap, quartet.samps=10, model="raw", tree.builder="nj")
12top 13top 23top unresolved
1414 448 645 23393
0.15 “F84” divergence and “neighbor-joining” tree building
F84.nj<-calc.heights(sl_aln, popmap, quartet.samps=10, model="F84", tree.builder="nj")
12top 13top 23top unresolved
1970 1160 833 21937
0.16 “raw” divergence and “UPGMA” tree building
raw.upgma<-calc.heights(sl_aln, popmap, quartet.samps=10, model="raw", tree.builder="upgma", out.table)
12top 13top 23top unresolved
1228 876 1460 22069
0.17 “F84” divergence and “UPGMA” tree building
F84.upgma<-calc.heights(sl_aln, popmap, quartet.samps=10, model="F84", tree.builder="upgma", out.table)
12top 13top 23top unresolved
1164 842 1567 22043
0.18 Plot the distribution of coal heights under a ‘raw NJ’ framework across 100 random quartet samples
#run 100 random samples
raw.nj<-calc.heights(sl_aln, popmap, quartet.samps=100, model="raw", tree.builder="nj")
12top 13top 23top unresolved
13263 4394 6291 235052
#extract mean of each rep for each topology and plot histograms
dist12<-c()
dist23<-c()
dist13<-c()
for (i in 1:length(levels(as.factor(raw.nj$rand.samp)))){
#isolate this replicate
dist12[i]<-mean(raw.nj$sister.dist[raw.nj$rand.samp == i & raw.nj$topology == "12top"])
dist23[i]<-mean(raw.nj$sister.dist[raw.nj$rand.samp == i & raw.nj$topology == "23top"])
dist13[i]<-mean(raw.nj$sister.dist[raw.nj$rand.samp == i & raw.nj$topology == "13top"])
}
df<-data.frame(replicate=c(1:length(levels(as.factor(raw.nj$rand.samp)))),
dist12.means=dist12,
dist23.means=dist23,
dist13.means=dist13)
c1 <- rgb(173,216,230,max = 255, alpha = 80, names = "lt.blue")
c2 <- rgb(255,192,203, max = 255, alpha = 80, names = "lt.pink")
c3 <- rgb(0,0,0, max = 255, alpha = 80, names = "lt.pink")
hist(df$dist12.means, breaks = seq(0,0.02,.001), col=c1, xlim = c(0,0.02), xaxp=c(0,0.02,5), xlab = "mean raw dist", main = "sister topology coalescent heights\nblue=(P1,P2), pink=(P2,P3), gray=(P1,P3)")
abline(v=mean(df$dist12.means), lty="dashed")
hist(df$dist23.means, breaks = seq(0,0.02,.001), col=c2, xlim = c(0,0.02), xaxp=c(0,0.02,5), add=TRUE)
abline(v=mean(df$dist23.means), lty="dashed")
hist(df$dist13.means, breaks = seq(0,0.02,.001), col=c3, xlim = c(0,0.02), xaxp=c(0,0.02,5), add=TRUE)
abline(v=mean(df$dist13.means), lty="dashed")hist(df$dist12.means, breaks = seq(0,0.02,.001), col=c1, xlim = c(0,0.02), xaxp=c(0,0.02,5), xlab = "mean raw dist", main = "sister topology coalescent heights\nblue=(P1,P2), pink=(P2,P3), gray=(P1,P3)")
abline(v=mean(df$dist12.means), lty="dashed")
hist(df$dist23.means, breaks = seq(0,0.02,.001), col=c2, xlim = c(0,0.02), xaxp=c(0,0.02,5), add=TRUE)
abline(v=mean(df$dist23.means), lty="dashed")